# Visibility Analysis Script Runner
# Alex F. Gazmararian
# agazmararian@gmail.com
#
# This script runs the complete visibility analysis pipeline
# Run this script from the project root directory
#
# Analysis examines how proximity to green energy projects affects
# public recognition and political credit attribution

# Bootstrap renv before any package is attached. Nothing happens when the renv
# namespace is already loaded; otherwise the first bootstrap file that exists
# (renv's activation script, then .Rprofile) is sourced. Both paths are
# relative to the working directory.
if (!is.element("renv", loadedNamespaces())) {
  for (bootstrap_file in c("renv/activate.R", ".Rprofile")) {
    if (file.exists(bootstrap_file)) {
      source(bootstrap_file)
      break
    }
  }
}

# Attach 'here' (used for every project-relative path below).
# If attaching fails, try to rebuild the project library with renv and attach
# again; if renv itself is unavailable, stop with setup instructions.
tryCatch(
  library(here),
  error = function(e) {
    message("Error loading 'here' package: ", conditionMessage(e))
    message("Attempting to restore renv environment...")
    # requireNamespace() is the documented way to test whether a package is
    # usable; scanning installed.packages() for one name is slow and discouraged.
    if (!requireNamespace("renv", quietly = TRUE)) {
      stop("The 'here' package is not available. Please ensure renv is properly set up and run renv::restore()")
    }
    renv::restore()
    library(here)
  }
)

# Source the project's function loader.
# NOTE(review): log_message() is used below but never defined in this file;
# presumably it is provided by this loader -- confirm in R/load_functions.R.
source(here("R", "load_functions.R"))

# Configuration ----
message("=== VISIBILITY ANALYSIS SCRIPT RUNNER ===")
start_time <- Sys.time()

# Taken from the "visibility.skip_renv_check" option (FALSE when unset).
# Not referenced again in the visible part of this script.
SKIP_RENV_CHECK <- getOption("visibility.skip_renv_check", default = FALSE)

# PNAS mode, taken from the "pnas.mode" option (TRUE when unset):
#   TRUE  -> only outputs for the paper are generated
#   FALSE -> additional diagnostic/internal outputs are also generated
pnas <- getOption("pnas.mode", default = TRUE)

# Environment Validation ----
message("=== ENVIRONMENT VALIDATION ===")

# Abort early when the running R is older than the minimum supported version.
min_r_version <- "4.3.0"
r_version <- getRversion()
if (r_version < min_r_version) {
  stop(sprintf("R version %s or higher required. Current version: %s", min_r_version, r_version))
}
message(sprintf("[OK] R version: %s", r_version))

# Data directories the pipeline expects; any that are absent are created
# (together with missing parents).
essential_dirs <- c(
  here("data", "input"),
  here("data", "inter"),
  here("data", "output")
)

missing_dirs <- essential_dirs[!dir.exists(essential_dirs)]
if (length(missing_dirs) > 0) {
  message("[INFO] Creating missing directories:")
  for (missing_dir in missing_dirs) {
    # The return value of dir.create() is not checked, so the "[OK]" line
    # below is printed whether or not the directory was actually created.
    dir.create(missing_dir, recursive = TRUE, showWarnings = FALSE)
    message(sprintf("  [OK] Created: %s", missing_dir))
  }
}
message("[OK] Essential data directory structure ready")

message("")
message("=== SYSTEM INFORMATION ===")
si <- Sys.info()
message(sprintf("Operating System: %s %s", si[["sysname"]], si[["release"]]))
message(sprintf("R Version: %s", R.version.string))
message(sprintf("Platform: %s", R.version[["platform"]]))

# Memory probe (skipped on Windows). The command output itself is never
# printed: a single "Memory: Available" line is emitted when the shell pipeline
# returns more than one line and the first line is not the fallback text.
if (!identical(si[["sysname"]], "Windows")) {
  tryCatch(
    {
      probe_cmd <- "free -h 2>/dev/null || vm_stat 2>/dev/null || echo 'Memory info unavailable'"
      mem_info <- system(probe_cmd, intern = TRUE)
      probe_ok <- length(mem_info) > 1 && !grepl("unavailable", mem_info[1])
      if (probe_ok) {
        message("Memory: Available")
      }
    },
    error = function(e) {
      # Best effort only: any failure of the probe is ignored.
    }
  )
}

message("")
message("[OK] Environment validation complete")
message("")

# Set up logging ----
# Make sure the log directory exists (created together with missing parents).
log_dir <- here("output", "pnas", "log")
if (!dir.exists(log_dir)) {
  dir.create(log_dir, recursive = TRUE, showWarnings = FALSE)
}

# Choose the log file. When the "pnas.unified_log_file" option is set (the
# unified PNAS pipeline's log), that file is used; otherwise this run gets its
# own file in log_dir, named after the run's start time.
unified_log_file <- getOption("pnas.unified_log_file", NULL)
if (is.null(unified_log_file)) {
  log_timestamp <- format(start_time, "%Y%m%d_%H%M%S")
  log_file <- file.path(log_dir, paste0("visibility_analysis_", log_timestamp, ".log"))
} else {
  log_file <- unified_log_file
}

# Run one pipeline script and report how it went.
#
# The script is sourced into the global environment. Start, completion time and
# any error are written with log_message(). Errors -- including a missing
# script file -- are caught, so the caller always receives a result list and
# the pipeline can continue with the next step.
#
# Arguments:
#   script_path  Path of the R script to source.
#   description  Human-readable label used in the log lines.
#
# Returns:
#   On success: list(success = TRUE, error = FALSE)
#   On failure: list(success = FALSE, error = TRUE,
#                    message = <error message>, script = script_path)
run_script <- function(script_path, description) {
  log_message("Running: %s", description)
  log_message("Script: %s", script_path)

  started_at <- Sys.time()

  # Shared failure path: log the problem and build the failure record.
  on_failure <- function(e) {
    log_message("[ERROR] in %s: %s", description, e$message)
    log_message("Script path: %s", script_path)
    log_message("")

    list(success = FALSE, error = TRUE, message = e$message, script = script_path)
  }

  tryCatch(
    {
      if (!file.exists(script_path)) {
        stop(sprintf("Script not found: %s", script_path))
      }

      # local = FALSE: the script is evaluated in the global environment.
      source(script_path, local = FALSE)

      elapsed <- as.numeric(difftime(Sys.time(), started_at, units = "secs"))
      log_message("[OK] Completed: %s (%.1f seconds)", description, elapsed)
      log_message("")

      list(success = TRUE, error = FALSE)
    },
    error = on_failure
  )
}

# Analysis Pipeline ----
log_message("Starting visibility analysis pipeline...")
log_message("Timestamp: %s", format(start_time, "%Y-%m-%d %H:%M:%S"))
log_message("")

# Outcome of every run_script() call, keyed by step name
results <- list()

# Replication mode decides the cleanup strategy further down.
# It is derived from the presence of the geocoding cache (an input):
#   cache present -> "full"       (the geo file can be generated)
#   cache absent  -> "anonymized" (cached results are needed)
# init_replication_mode() is not used here because the geo file (an output)
# doesn't exist yet at this point.
geocoding_cache <- here("data", "cache", "geocoding", "zip_code_geoloc.csv")
REPLICATION_MODE <- if (file.exists(geocoding_cache)) "full" else "anonymized"
message(switch(REPLICATION_MODE,
  full = "Replication mode: FULL (geocoding cache available)",
  anonymized = "Replication mode: ANONYMIZED (no geocoding cache)"
))

# Publish the mode through options() so child scripts can read it instead of
# re-detecting it (possibly incorrectly).
options(replication.mode = REPLICATION_MODE)

source(here("R", "visibility", "replication_mode.R"))

# Cleanup: Remove intermediate files ----
# Start the run from a clean slate. What is deleted depends on the mode:
#   anonymized -> only the final analysis dataset; cached intermediate files
#                 cannot be regenerated in this mode, so they are preserved
#   full       -> every intermediate file plus the final analysis dataset
log_message("=== CLEANING INTERMEDIATE FILES ===")

if (REPLICATION_MODE == "anonymized") {
  log_message("Anonymized mode: preserving cached intermediate files")

  # Files that can be regenerated even in anonymized mode
  regenerable_files <- c(
    here("data", "output", "visibility_analysis.rds")
  )

  cleanup_targets <- regenerable_files
  cleanup_summary <- "Removed %d output files (preserved cached intermediates)"
} else {
  log_message("Full mode: cleaning all intermediate files")

  # NOTE: survey_visibility_processed.rds is now in data/cache/ and is
  # regenerated by process_survey.R, not deleted here
  intermediate_files <- c(
    # Core intermediate data files (geo file is in inter, base file is in cache)
    here("data", "inter", "survey_visibility_processed_with_geo.rds"),
    here("data", "inter", "bgm_processed.rds"),
    here("data", "inter", "eia_processed.rds"),
    here("data", "inter", "eia_survey_processed.rds"),

    # Statement processing files
    here("data", "inter", "statements_processed.csv"),
    here("data", "inter", "annotated_statements.csv"),

    # Final output files
    here("data", "output", "visibility_analysis.rds")
  )

  cleanup_targets <- intermediate_files
  cleanup_summary <- "Removed %d intermediate files"
}

# One removal loop for both modes. Only deletions that actually succeeded are
# counted; a file that could not be removed is reported, because a stale file
# left behind would otherwise go unnoticed by the steps that follow.
files_removed <- 0L
for (file_path in cleanup_targets) {
  if (!file.exists(file_path)) {
    next
  }
  log_message("Removing: %s", basename(file_path))
  if (isTRUE(file.remove(file_path))) {
    files_removed <- files_removed + 1L
  } else {
    log_message("[WARN] Could not remove: %s", file_path)
  }
}

log_message(cleanup_summary, files_removed)

# Ensure output directories exist after cleanup.
# Existing directories are skipped silently; each one that had to be created
# is reported in the log.
output_dirs_to_create <- c(
  here("output", "pnas", "tables"),
  here("output", "pnas", "figures"),
  here("output", "pnas", "stats"),
  here("data", "inter"),
  here("data", "output"),
  here("data", "cache", "geocoding")
)

for (dir_path in output_dirs_to_create) {
  if (dir.exists(dir_path)) {
    next
  }
  dir.create(dir_path, recursive = TRUE, showWarnings = FALSE)
  log_message("Created directory: %s", dir_path)
}

log_message("")

# Phase 1: Data Processing ----
log_message("=== PHASE 1: DATA PROCESSING ===")

# Path builders for the script folders used in this phase. Each forwards its
# arguments to here() after a fixed folder prefix.
processing_script <- function(...) here("analysis", "visibility", "processing", ...)
covariate_script <- function(...) here("R", "covariates", ...)
annotation_script <- function(...) here("R", "annotation", ...)

# Steps run strictly in the order written; each outcome is stored in `results`
# under the step's name. A failing step is recorded and the run continues.

# Survey responses
results$process_survey <- run_script(
  processing_script("process_survey.R"),
  "Process and geocode survey responses"
)

# BGM project data (two stages)
results$process_bgm <- run_script(
  covariate_script("process_bgm.R"),
  "Process BGM investment database"
)

results$process_turner <- run_script(
  covariate_script("process_turner_data.R"),
  "Second stage of processing BGM investment data"
)

# Covariates required by the merge step
results$broadband <- run_script(
  covariate_script("broadband.R"),
  "Process broadband coverage data"
)

results$bea <- run_script(
  covariate_script("bea.R"),
  "Process BEA economic data"
)

results$pres_elections <- run_script(
  covariate_script("pres_elec_2020.R"),
  "Process 2020 presidential election data"
)

results$electricity_prices <- run_script(
  covariate_script("electricity_prices.R"),
  "Process electricity price data"
)

results$highways <- run_script(
  covariate_script("highways.R"),
  "Process highway access data"
)

results$unemployment <- run_script(
  covariate_script("unemployment.R"),
  "Process unemployment data"
)

results$unions <- run_script(
  covariate_script("unions.R"),
  "Process union membership data"
)

results$census <- run_script(
  covariate_script("census.R"),
  "Process Census/ACS demographic data"
)

results$dma <- run_script(
  covariate_script("dma.R"),
  "Process DMA (Designated Market Areas) data"
)

# EIA data: download, then process
results$download_eia <- run_script(
  processing_script("process_eia", "download_eia.R"),
  "Download EIA-860M generator data"
)

results$process_eia <- run_script(
  processing_script("process_eia", "process_eia.R"),
  "Process EIA generator inventory"
)

# Geocoordinates (downloaded if not already present)
results$download_geocoordinates <- run_script(
  processing_script("geocode_zip.R"),
  "Download geocoordinates"
)

# Geographic identifiers for the survey data
results$process_geo_identifiers <- run_script(
  processing_script("process_geo_identifiers.R"),
  "Add geographic identifiers (FIPS codes) to survey data"
)

# Population density measures
results$popdensity <- run_script(
  processing_script("calculate_population_density.R"),
  "Calculate population density measures"
)

# Statement data: processing and GPT annotation (both required for the
# project distance calculations)
results$process_statements <- run_script(
  annotation_script("process.R"),
  "Process raw statement data"
)

results$annotate_statements <- run_script(
  annotation_script("annotate.R"),
  "GPT-based statement annotation"
)

# Distances to projects
results$project_distance <- run_script(
  processing_script("project_distance.R"),
  "Calculate distances to energy projects"
)

# Phase 2: Data Integration ----
log_message("=== PHASE 2: DATA INTEGRATION ===")

# Single step: merge.R brings all data sources together.
results$merge <- run_script(
  here("analysis", "visibility", "processing", "merge.R"),
  "Integrate survey, project, and geographic data"
)

# Phase 4: Statistical Analysis ----
# NOTE(review): the numbering jumps from Phase 2 to Phase 4 -- there is no
# Phase 3 in this script. The logged label is left as is; confirm whether a
# phase was removed or the label should be renumbered.
log_message("=== PHASE 4: ANALYSIS ===")

# Path builder for scripts in the analysis folder; forwards its arguments to
# here() after the fixed folder prefix.
analysis_script <- function(...) here("analysis", "visibility", "analysis", ...)

# Proximity effects: main and heterogeneous
results$proximity_analysis <- run_script(
  analysis_script("proximity_analysis.R"),
  "Main proximity effects analysis"
)

results$proximity_heterogeneity <- run_script(
  analysis_script("proximity_heterogeneity.R"),
  "Heterogeneous proximity effects analysis"
)

# Power analysis
results$power_analysis <- run_script(
  analysis_script("power_analysis.R"),
  "Power analysis"
)

# Credit attribution
results$perceived_credit <- run_script(
  analysis_script("perceived_credit.R"),
  "Political credit attribution analysis"
)

# Perceived benefits
results$perceived_benefits <- run_script(
  analysis_script("perceived_benefits.R"),
  "Perceived community benefits analysis"
)

# Descriptive statistics
results$summary_stats <- run_script(
  analysis_script("summary_statistics.R"),
  "Generate descriptive statistics"
)

# Fig. 2
results$build_fig2 <- run_script(
  analysis_script("build_fig2.R"),
  "Build Fig. 2"
)

# Within-state variation in the data
results$data_distribution <- run_script(
  analysis_script("variation_respondent_distance.R"),
  "Analyze within-state variation"
)

# Fig. 1 (distance distribution)
results$distance_dist <- run_script(
  analysis_script("visualize_project_distance.R"),
  "Create Fig. 1"
)

# Supplementary figures for EIA and green manufacturing data
results$fig_s16 <- run_script(
  analysis_script("fig_S16_eia_capacity.R"),
  "Create Fig. S16 (EIA capacity by technology)"
)

results$fig_s17 <- run_script(
  analysis_script("green_mfg_plot.R"),
  "Create Fig. S17 (green manufacturing announcements)"
)

# Weights diagnosis (this script lives in the processing folder)
results$weights <- run_script(
  here("analysis", "visibility", "processing", "weights_diagnosis.R"),
  "Calculate weights"
)

# Summary ----
# Wall-clock duration of the whole run, in minutes
end_time <- Sys.time()
total_elapsed <- as.numeric(difftime(end_time, start_time, units = "mins"))

timestamp_format <- "%Y-%m-%d %H:%M:%S"
log_message("=== PIPELINE COMPLETE ===")
log_message("Total runtime: %.1f minutes", total_elapsed)
log_message("Start time: %s", format(start_time, timestamp_format))
log_message("End time: %s", format(end_time, timestamp_format))

# Final system information summary
host_info <- Sys.info()
log_message("")
log_message("=== REPRODUCIBILITY INFORMATION ===")
log_message("R Version: %s", R.version.string)
log_message("Platform: %s", R.version[["platform"]])
log_message("System: %s %s", host_info[["sysname"]], host_info[["release"]])
log_message("Working Directory: %s", getwd())

# Memory usage at completion
# Best-effort snapshot, taken with an OS-specific shell command. Only macOS
# ("Darwin") and Linux are handled; on any other system nothing is logged.
tryCatch(
  {
    switch(Sys.info()[["sysname"]],
      Darwin = {
        # First line of the memory_pressure report
        mem_pressure <- system("memory_pressure 2>/dev/null | head -1", intern = TRUE)
        if (length(mem_pressure) > 0 && !grepl("command not found", mem_pressure)) {
          log_message("Memory Status: %s", mem_pressure)
        }
      },
      Linux = {
        # Second line of `free -h`, with runs of whitespace collapsed
        mem_info <- system("free -h | head -2", intern = TRUE)
        if (length(mem_info) >= 2) {
          log_message("Memory Usage: %s", gsub("\\s+", " ", mem_info[2]))
        }
      }
    )
  },
  error = function(e) {
    # Deliberately ignored: memory info is nice-to-have.
  }
)

# R session information
log_message("Locale: %s", Sys.getlocale("LC_CTYPE"))
log_message("Timezone: %s", Sys.timezone())

# Package information for key dependencies
# Log the installed version of each key package; packages that are not
# installed are skipped. system.file(package = ) returns "" for a package that
# cannot be found, which avoids scanning the whole library with
# installed.packages() just to test a handful of names.
key_packages_info <- c("here", "dplyr", "sf", "ggplot2", "renv")
is_installed <- vapply(
  key_packages_info,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
existing_packages <- key_packages_info[is_installed]
if (length(existing_packages) > 0) {
  log_message("Key package versions:")
  for (pkg in existing_packages) {
    # Named pkg_version so base R's `version` object is not masked by a
    # global; converted to text before it is handed to the logger.
    pkg_version <- as.character(utils::packageVersion(pkg))
    log_message("  %s: %s", pkg, pkg_version)
  }
}

# renv status if available
# Records, for reproducibility tracking: whether the active renv project is the
# working directory, the MD5 hash of renv.lock, and whether the package library
# and the lockfile agree. Any failure is logged, never raised.
if (nzchar(system.file(package = "renv"))) {
  tryCatch(
    {
      if (file.exists("renv.lock")) {
        # renv::project() returns NULL when no project is active. isTRUE() keeps
        # that case from aborting this whole section with a zero-length `if`.
        project_is_cwd <- isTRUE(renv::project() == getwd())
        log_message(
          "renv project detected: %s",
          if (project_is_cwd) getwd() else "renv.lock found"
        )

        # Get lockfile hash for reproducibility tracking
        lockfile_path <- file.path(getwd(), "renv.lock")
        lockfile_hash <- unname(tools::md5sum(lockfile_path))
        log_message("renv.lock hash: %s", lockfile_hash)

        # Final sync status check
        final_status <- tryCatch(
          {
            # Capture the printed report (standard output) and keep the value
            # renv::status() returns invisibly.
            renv_status <- NULL
            status_output <- utils::capture.output(
              renv_status <- suppressMessages(renv::status())
            )

            in_sync <- if (is.list(renv_status) && is.logical(renv_status$synchronized)) {
              # Preferred: the `synchronized` flag of the returned list
              isTRUE(renv_status$synchronized)
            } else {
              # Fallback when the flag is unavailable: scan the printed report
              !any(grepl(
                "inconsistent|not installed|out of sync",
                status_output,
                ignore.case = TRUE
              ))
            }

            if (in_sync) "packages synchronized" else "packages out of sync"
          },
          error = function(e) {
            "status check failed"
          }
        )
        log_message("renv final status: %s", final_status)
      } else {
        log_message("renv: no lockfile found")
      }
    },
    error = function(e) {
      log_message("renv status: error checking - %s", conditionMessage(e))
    }
  )
} else {
  log_message("renv: not installed")
}

# Check for errors
# Keep only the step results that are lists carrying error = TRUE; the explicit
# type and name checks guard against entries that are not result lists.
errors <- purrr::keep(results, function(step_result) {
  is.list(step_result) && "error" %in% names(step_result) && isTRUE(step_result$error)
})

if (length(errors) == 0) {
  log_message("[OK] All scripts completed successfully!")
} else {
  log_message("")
  log_message("[ERROR] ERRORS ENCOUNTERED:")
  # One line per failed step: script path and error message
  for (error_info in errors) {
    log_message("  %s: %s", error_info$script, error_info$message)
  }
  log_message("")
  log_message("Check log file for details: %s", log_file)
}

# Where to find the log and the results
log_message("")
log_message("Log file: %s", log_file)
log_message("Results saved to: %s", here("data", "output"))
log_message("Tables and figures in: %s", here("output"))

# Display key outputs
log_message("")
log_message("=== KEY OUTPUTS ===")
log_message("Main dataset: %s", here("data", "output", "visibility_analysis.rds"))
log_message("Tables: %s", here("output", "pnas", "tables"))
log_message("Figures: %s", here("output", "pnas", "figures"))
log_message("")
log_message("Analysis complete! Check output directories for results.")
